Global Fishing Watch workflow

Author

Théophile L. Mouton

1. 0.1 degree resolution

Load R libraries

Code
# Load required libraries
library(tidyverse)    # includes ggplot2, dplyr, purrr and readr
library(maps)         # map_data() world polygons
library(gridExtra)
library(sf)
library(viridis)      # scale_fill_viridis()
library(scales)       # comma() legend labels
library(randomForest)
library(caret)
library(pdp)
# the here package is used below via here::here(), so it is not attached

AIS data (2017-2020)

Code
# Set the path to the 2017-2020 folder

#path <- "Data/AIS Fishing Effort 2017-2020"

# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "\\.csv$", full.names = TRUE, recursive = TRUE)

# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

# Function to snap coordinates to a 0.1 degree grid
# (floor() indexes each cell by its lower-left corner)
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}

# Standardize and aggregate AIS data
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(coords = map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = AIS_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "AIS Fishing Effort (hours; 2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
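
As an aside, standardize_coords builds a list for every row via map2(), which is slow on millions of cells. Since floor() is already vectorised, an equivalent formulation is a plain mutate(). A behaviour-identical sketch (AIS_data_std_fast is a hypothetical name, not used elsewhere in this workflow):

Code
# Vectorised equivalent of the map2()/map_dbl() pipeline above
AIS_data_std_fast <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = floor(cell_ll_lon * 10) / 10,
    lat_std = floor(cell_ll_lat * 10) / 10
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")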

Sentinel-1 SAR data (2017-2020)

Code
# Set the path to the 2017-2020 folder
#path <- "Data/SAR Vessel detections 2017-2020"

# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "\\.csv$", full.names = TRUE)

# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
#  map_df(~read_csv(.))

load(here::here("Data","SAR_fishing.Rdata"))

# Aggregate SAR vessel detections by latitude and longitude
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  # Keep only detections classified as fishing with high confidence
  filter(fishing_score >= 0.9) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))

# Standardize and aggregate SAR data
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(coords = map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = map_dbl(coords, ~ .x$lon_std),
    lat_std = map_dbl(coords, ~ .x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Create the world map
world_map <- map_data("world")

# Create the plot
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = SAR_data_std, 
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Fishing vessel detections (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
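
The fishing_score >= 0.9 cutoff above is a strong filter. As a hedged sketch (not part of the original workflow), one quick way to check how sensitive the detection count is to that choice:

Code
# How many SAR detections survive each candidate fishing_score threshold?
thresholds <- c(0.5, 0.7, 0.8, 0.9, 0.95)
tibble(
  threshold = thresholds,
  n_detections = map_int(thresholds, ~ sum(SAR_fishing$fishing_score >= .x, na.rm = TRUE))
)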

Compare AIS and SAR detection locations

Code
# Merge the datasets
combined_data <- full_join(
  AIS_data_std,
  SAR_data_std,
  by = c("lon_std", "lat_std")
)

# Categorize each cell
combined_data <- combined_data %>%
  mutate(category = case_when(
    total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
    total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
    (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
    TRUE ~ "No fishing detected"
  ))

# Create the world map
world_map <- map_data("world")

# Create the plot
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = combined_data, 
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple", 
               "Only AIS" = "blue", 
               "Only SAR" = "red", 
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Display the plot
print(world_plot)

Code
# Get summary statistics
summary_stats <- combined_data %>% 
  count(category) %>% 
  mutate(percentage = n / sum(n) * 100)

print(summary_stats)
# A tibble: 3 × 3
  category               n percentage
  <chr>              <int>      <dbl>
1 Both AIS and SAR  163095       9.12
2 Only AIS         1566190      87.6 
3 Only SAR           58668       3.28
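
These percentages count grid cells, and 0.1-degree cells shrink towards the poles, so raw counts overstate high-latitude area. A minimal sketch of an area-weighted variant, using cos(latitude) as an approximate relative cell area (an assumption, not part of the original workflow):

Code
# Approximate area-weighted shares: cell area scales with cos(latitude)
area_weighted_stats <- combined_data %>%
  mutate(area_weight = cos(lat_std * pi / 180)) %>%
  count(category, wt = area_weight, name = "weighted_n") %>%
  mutate(percentage = weighted_n / sum(weighted_n) * 100)

print(area_weighted_stats)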

Random forest model

Code
#---- Open the AIS data 

#load(here::here("Data", "AIS_fishing.Rdata"))

# Aggregate fishing hours by latitude and longitude
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
  ungroup() %>%
  # Remove any cells with zero or negative fishing hours
  filter(total_fishing_hours > 0)

#---- Open the SAR data 

#load(here::here("Data", "SAR_fishing.Rdata"))

# Aggregate SAR vessel detections by latitude and longitude
aggregated_SAR_fishing <- SAR_fishing %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  #  filter(matched_category == "fishing") %>%
  # Keep only detections classified as fishing with high confidence
  filter(fishing_score >= 0.9) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))


# Function to snap coordinates to a 0.1 degree grid (as defined above;
# floor() indexes each cell by its lower-left corner)
standardize_coords <- function(lon, lat) {
  list(
    lon_std = floor(lon * 10) / 10,
    lat_std = floor(lat * 10) / 10
  )
}

# Aggregate AIS data to 0.1 degree resolution
AIS_data_01deg <- aggregated_AIS_fishing %>%
  mutate(coords = purrr::map2(cell_ll_lon, cell_ll_lat, standardize_coords)) %>%
  mutate(
    lon_std = sapply(coords, function(x) x$lon_std),
    lat_std = sapply(coords, function(x) x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")

# Aggregate SAR data to 0.1 degree resolution
SAR_data_01deg <- aggregated_SAR_fishing %>%
  mutate(coords = purrr::map2(lon_rounded, lat_rounded, standardize_coords)) %>%
  mutate(
    lon_std = sapply(coords, function(x) x$lon_std),
    lat_std = sapply(coords, function(x) x$lat_std)
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")

# Merge the datasets
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0
  )

# Separate the data into training (both AIS and SAR) and prediction (only SAR) sets
training_data <- combined_data_01deg %>%
  filter(has_AIS & has_SAR) %>%
  select(total_fishing_hours, total_presence_score, lon_std, lat_std)

prediction_data <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(total_presence_score, lon_std, lat_std)

# Train the random forest model with timing
#set.seed(123)  # for reproducibility
#rf_timing <- system.time({
#  rf_model_01deg <- randomForest(
#    total_fishing_hours ~ total_presence_score + lon_std + lat_std,
#    data = training_data,
#    ntree = 500,
#    importance = TRUE
#  )
#})

# Print the time taken
#print(paste("Random Forest model training time:", rf_timing["elapsed"], "seconds"))

#save(rf_model_01deg,file="rf_model_01deg.Rdata")
load(here::here("rf_model_01deg.Rdata"))

# Visualise the results
# Make predictions (the loaded object is named rf_model_01deg)
predictions <- predict(rf_model_01deg, newdata = prediction_data)

# Add predictions to the original dataset:
# keep observed AIS hours where available, otherwise predict from SAR
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predict(rf_model_01deg, newdata = select(., total_presence_score, lon_std, lat_std)),
      TRUE ~ 0
    )
  )

# Create the world map
world_map <- map_data("world")

# Map of predicted fishing hours only
# Prepare the data for the map: cells with SAR detections but no AIS effort
# (avoid the name map_data, which would shadow ggplot2::map_data())
predicted_sar_cells <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(lon_std, lat_std, predicted_fishing_hours)

# Create the map
predicted_SAR_only_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "darkgray", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = predicted_sar_cells, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Predicted fishing hours (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Predicted Fishing Hours in Areas with Only SAR Detections",
       subtitle = "0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_SAR_only_plot)
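
caret is loaded at the top but otherwise unused. As a hedged sketch (not part of the original workflow), the same specification could be cross-validated with it; the subsample size, ntree and mtry grid below are illustrative assumptions chosen to keep run time manageable:

Code
# Sketch: 5-fold cross-validation of the random forest with caret
set.seed(123)
cv_sample <- training_data %>% slice_sample(n = 20000)

cv_fit <- caret::train(
  total_fishing_hours ~ total_presence_score + lon_std + lat_std,
  data = cv_sample,
  method = "rf",
  ntree = 100,
  trControl = caret::trainControl(method = "cv", number = 5),
  tuneGrid = data.frame(mtry = c(1, 2, 3))
)

print(cv_fit)  # cross-validated RMSE and R-squared for each mtry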

Code
# Map of both original and predicted AIS fishing hours
# Visualise the results
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = combined_data_01deg, 
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Predicted fishing hours (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Predicted Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

print(predicted_plot)

Code
# Aggregate data to 1 degree resolution
# First, round the coordinates to the nearest whole degree
# (this centres each 1-degree cell on integer coordinates; see the
#  floor()-based sketch after this block for the lower-left convention)
combined_data_1deg <- combined_data_01deg %>%
  mutate(
    lon_1deg = round(lon_std),
    lat_1deg = round(lat_std)
  )

# Aggregate the data
aggregated_data <- combined_data_1deg %>%
  group_by(lon_1deg, lat_1deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = 'drop'
  )

# Create the world map
world_map <- map_data("world")

# Create the map
predicted_plot_1deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", linewidth = 0.1) +
  geom_tile(data = aggregated_data, 
            aes(x = lon_1deg, y = lat_1deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Predicted fishing hours (2017-2020)", 
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Predicted Fishing Hours (1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data made at 0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

# Print the map
print(predicted_plot_1deg)
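
Note that round() above centres 1-degree cells on integer coordinates, whereas the 0.1-degree grid is indexed by lower-left corners via floor(). If a consistent lower-left convention is preferred, a minimal sketch of the alternative binning (combined_data_1deg_ll is a hypothetical name):

Code
# floor()-based 1 degree binning, consistent with the 0.1 degree grid
combined_data_1deg_ll <- combined_data_01deg %>%
  mutate(
    lon_1deg = floor(lon_std),
    lat_1deg = floor(lat_std)
  )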

Code
# Evaluate the model
# Helper to compute error metrics. log_target = TRUE assumes the model was
# trained on a log10(x + 1) target, which is inverted via 10^predictions - 1.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  
  actual <- data$total_fishing_hours
  
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  
  # R-squared and Adjusted R-squared
  ss_total <- sum((actual - mean(actual))^2, na.rm = TRUE)
  ss_residual <- sum((actual - predictions)^2, na.rm = TRUE)
  r_squared <- 1 - (ss_residual / ss_total)
  n <- length(actual)
  p <- nrow(importance(model)) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  
  # Feature Importance (for Random Forest)
  feature_importance <- importance(model)
  
  return(list(
    MAE = mae,                            # Mean Absolute Error
    RMSE = rmse,                          # Root Mean Squared Error
    MAPE = mape,                          # Mean Absolute Percentage Error
    MedAE = medae,                        # Median Absolute Error
    R_squared = r_squared,                # R-Squared (Coefficient of Determination)
    Adjusted_R_squared = adj_r_squared,   # Adjusted R-Squared
    Mean_Residual = mean_residual,        # Mean of Residuals
    SD_Residual = sd_residual,            # Standard Deviation of Residuals
    Feature_Importance = feature_importance # Feature Importance (for Random Forest)
  ))
}

# Re-merge the datasets to label each cell by data availability
# (note: this rebuilds combined_data_01deg, dropping the
#  predicted_fishing_hours column computed above)
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0,
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )

# Evaluate the model on the cells with both AIS and SAR data
# (note: an in-sample check, since the model was trained on these same cells)
validation_data <- combined_data_01deg %>% filter(data_category == "Both AIS and SAR")
results_no_transform <- evaluate_model(rf_model_01deg, validation_data)
print(results_no_transform)
$MAE
[1] 275.048

$RMSE
[1] 1177.043

$MAPE
[1] 1965.5

$MedAE
[1] 55.19061

$R_squared
[1] 0.9165729

$Adjusted_R_squared
[1] 0.9165714

$Mean_Residual
[1] -0.1617396

$SD_Residual
[1] 1177.047

$Feature_Importance
                      %IncMSE IncNodePurity
total_presence_score 98.83689  987162725907
lon_std              43.65497  791743926811
lat_std              38.67563  696033312224

Model evaluation interpretation:

Strengths: the model explains most of the variance in the data (high R² and adjusted R²), and the median absolute error (MedAE) is reasonably low, so the bulk of predictions are accurate.

Weaknesses: the high RMSE, MAPE and residual standard deviation point to a small number of very large errors and probable outliers. The extreme MAPE is particularly concerning: many cells have low actual fishing hours, so even modest absolute errors translate into enormous percentage errors.

Feature Importance: The model relies heavily on the total_presence_score feature, with location features also playing significant roles.
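
pdp is loaded at the top but otherwise unused. Given how dominant total_presence_score is, a partial dependence plot would show the shape of that relationship. A minimal sketch (the subsample and grid resolution are assumptions to keep run time manageable):

Code
# Sketch: partial dependence of predicted fishing hours on the SAR presence score
set.seed(123)
pd_train <- training_data %>% slice_sample(n = 5000)

pd <- pdp::partial(
  rf_model_01deg,
  pred.var = "total_presence_score",
  train = pd_train,
  grid.resolution = 20
)

autoplot(pd) + theme_minimal()  # ggplot2 method provided by pdp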

To improve the model, investigate the outliers driving the large errors, and consider refining the fit or trying alternative approaches that handle extreme values better, for example modelling a transformed target, as sketched below.
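
One concrete option, anticipated by the log_target argument of evaluate_model above, is to refit on a log10(x + 1) target so extreme cells carry less weight. A hedged sketch (rf_model_log is a hypothetical object, and the fit is as slow as the original commented-out training):

Code
# Sketch: refit the forest on a log10(x + 1) target to damp extreme cells
set.seed(123)
rf_model_log <- randomForest(
  log10(total_fishing_hours + 1) ~ total_presence_score + lon_std + lat_std,
  data = training_data,
  ntree = 500,
  importance = TRUE
)

# Evaluate on the original scale; log_target = TRUE inverts the transform
results_log <- evaluate_model(rf_model_log, validation_data, log_target = TRUE)
print(results_log)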